2. Look at the data
- Inspect the dimensions,
head, and tail of
the data
- Inspect the structure of each variables. Are they in the correct
format?
dim(cv_states)
## [1] 51178 9
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 1 Alabama 2020-11-02 1 194892 2973 1 4887871 96.50939 AL
## 2 Alabama 2021-07-17 1 559478 11443 1 4887871 96.50939 AL
## 3 Alabama 2021-07-06 1 551298 11358 1 4887871 96.50939 AL
## 4 Alabama 2022-02-24 1 1276580 18102 1 4887871 96.50939 AL
## 5 Alabama 2020-08-01 1 89349 1603 1 4887871 96.50939 AL
## 6 Alabama 2022-05-07 1 1302397 19601 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 51173 Wyoming 2021-12-05 56 111812 1428 56 577737 5.950611 WY
## 51174 Wyoming 2022-06-11 56 159707 1824 56 577737 5.950611 WY
## 51175 Wyoming 2022-11-01 56 178866 1914 56 577737 5.950611 WY
## 51176 Wyoming 2021-12-31 56 115638 1526 56 577737 5.950611 WY
## 51177 Wyoming 2021-10-11 56 95137 1041 56 577737 5.950611 WY
## 51178 Wyoming 2021-10-19 56 98567 1136 56 577737 5.950611 WY
str(cv_states)
## 'data.frame': 51178 obs. of 9 variables:
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ date : IDate, format: "2020-11-02" "2021-07-17" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 194892 559478 551298 1276580 89349 1302397 1524549 1259580 547135 546540 ...
## $ deaths : int 2973 11443 11358 18102 1603 19601 20423 17505 11252 11220 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : chr "AL" "AL" "AL" "AL" ...
3. Format the data
# format the date
cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
# format the state and state abbreviation (abb) variables
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
# order the data first by state, second by date
cv_states = cv_states[order(cv_states$state, cv_states$date),]
# Confirm the variables are now correctly formatted
str(cv_states)
## 'data.frame': 51178 obs. of 9 variables:
## $ state : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ date : Date, format: "2020-03-13" "2020-03-14" ...
## $ fips : int 1 1 1 1 1 1 1 1 1 1 ...
## $ cases : int 6 12 23 29 39 51 78 106 131 157 ...
## $ deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
## $ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
## $ abb : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 769 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 574 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 830 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 315 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 612 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 421 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
tail(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 50415 Wyoming 2022-11-05 56 178866 1914 56 577737 5.950611 WY
## 50836 Wyoming 2022-11-06 56 178866 1914 56 577737 5.950611 WY
## 50530 Wyoming 2022-11-07 56 178866 1914 56 577737 5.950611 WY
## 51159 Wyoming 2022-11-08 56 179366 1917 56 577737 5.950611 WY
## 51162 Wyoming 2022-11-09 56 179366 1917 56 577737 5.950611 WY
## 50939 Wyoming 2022-11-10 56 179366 1917 56 577737 5.950611 WY
# Inspect range of values for each variable. What is the date range? The range of cases and deaths?
head(cv_states)
## state date fips cases deaths geo_id population pop_density abb
## 769 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
## 574 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
## 830 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
## 315 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
## 612 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
## 421 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
summary(cv_states)
## state date fips cases
## Washington : 1025 Min. :2020-01-21 Min. : 1.00 Min. : 1
## Illinois : 1022 1st Qu.:2020-11-02 1st Qu.:16.00 1st Qu.: 89020
## California : 1021 Median :2021-07-06 Median :29.00 Median : 342322
## Arizona : 1020 Mean :2021-07-06 Mean :29.78 Mean : 813572
## Massachusetts: 1014 3rd Qu.:2022-03-09 3rd Qu.:44.00 3rd Qu.: 957840
## Wisconsin : 1010 Max. :2022-11-10 Max. :72.00 Max. :11411718
## (Other) :45066
## deaths geo_id population pop_density
## Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
## 1st Qu.: 1367 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 43.659
## Median : 5114 Median :29.00 Median : 4468402 Median : 107.860
## Mean :11371 Mean :29.78 Mean : 6404024 Mean : 422.944
## 3rd Qu.:14391 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
## Max. :97142 Max. :72.00 Max. :39557045 Max. :11490.120
## NA's :973
## abb
## WA : 1025
## IL : 1022
## CA : 1021
## AZ : 1020
## MA : 1014
## WI : 1010
## (Other):45066
min(cv_states$date)
## [1] "2020-01-21"
max(cv_states$date)
## [1] "2022-11-10"
# Add variables for new_cases and new_deaths
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
cv_subset = cv_subset[order(cv_subset$date),]
# add starting level for new cases and deaths
cv_subset$new_cases = cv_subset$cases[1]
cv_subset$new_deaths = cv_subset$deaths[1]
for (j in 2:nrow(cv_subset)) {
cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j-1]
cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$new_cases[cv_states$state==state_list[i]] = cv_subset$new_cases
cv_states$new_deaths[cv_states$state==state_list[i]] = cv_subset$new_deaths
}
# Focus on recent dates
cv_states <- cv_states %>% dplyr::filter(date >= "2022-06-01")
p1<-ggplot(cv_states, aes(x = date, y = new_cases, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1<-NULL # to clear from workspace
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2<-NULL # to clear from workspace
# set negative new case or death counts to 0
cv_states$new_cases[cv_states$new_cases<0] = 0
cv_states$new_deaths[cv_states$new_deaths<0] = 0
# Recalculate `cases` and `deaths` as cumulative sum of updated `new_cases` and `new_deaths`
for (i in 1:length(state_list)) {
cv_subset = subset(cv_states, state == state_list[i])
# add starting level for new cases and deaths
cv_subset$cases = cv_subset$cases[1]
cv_subset$deaths = cv_subset$deaths[1]
### FINISH CODE HERE
for (j in 2:nrow(cv_subset)) {
cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j-1]
cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j-1]
}
# include in main dataset
cv_states$cases[cv_states$state==state_list[i]] = cv_subset$cases
cv_states$deaths[cv_states$state==state_list[i]] = cv_subset$deaths
}
library(zoo)
# Smooth new counts
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k=7, fill=NA, align='right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k=7, fill=NA, align='right') %>% round(digits = 0)
# Inspect data again interactively
p2<-ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) + geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2=NULL
5. Add additional variables
# add population normalized (by 100,000) counts for each variable
cv_states$per100k = as.numeric(format(round(cv_states$cases/(cv_states$population/100000),1),nsmall=1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases/(cv_states$population/100000),1),nsmall=1))
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths/(cv_states$population/100000),1),nsmall=1))
cv_states$newdeathsper100k = as.numeric(format(round(cv_states$new_deaths/(cv_states$population/100000),1),nsmall=1))
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths*100/cases),2))
# create a `cv_states_today` dataset
cv_states_today = subset(cv_states, date==max(cv_states$date))